001 /*
002 Copyright (c) 2012, Regents of the University of Colorado
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without modification,
006 are permitted provided that the following conditions are met:
007
008 * Redistributions of source code must retain the above copyright notice, this
009 list of conditions and the following disclaimer.
010
011 * Redistributions in binary form must reproduce the above copyright notice,
012 this list of conditions and the following disclaimer in the documentation
013 and/or other materials provided with the distribution.
014
015 * Neither the name of the University of Colorado nor the names of its
016 contributors may be used to endorse or promote products derived from this
017 software without specific prior written permission.
018
019 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
023 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030 package edu.ucdenver.ccp.nlp.biolemmatizer.uima;
031
032 import java.lang.reflect.Constructor;
033 import java.lang.reflect.InvocationTargetException;
034 import java.lang.reflect.Method;
035 import java.util.ArrayList;
036 import java.util.Iterator;
037 import java.util.List;
038
039 import org.apache.uima.UimaContext;
040 import org.apache.uima.analysis_engine.AnalysisEngine;
041 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
042 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
043 import org.apache.uima.cas.CAS;
044 import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
045 import org.apache.uima.fit.descriptor.ConfigurationParameter;
046 import org.apache.uima.fit.factory.AnalysisEngineFactory;
047 import org.apache.uima.fit.factory.ConfigurationParameterFactory;
048 import org.apache.uima.jcas.JCas;
049 import org.apache.uima.jcas.tcas.Annotation;
050 import org.apache.uima.resource.ResourceInitializationException;
051 import org.apache.uima.resource.metadata.TypeSystemDescription;
052
053 import edu.ucdenver.ccp.nlp.biolemmatizer.BioLemmatizer;
054 import edu.ucdenver.ccp.nlp.biolemmatizer.LemmataEntry;
055 import edu.ucdenver.ccp.uima.shims.annotation.AnnotationDataExtractor;
056 import edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.LemmaDecorator;
057 import edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeechDecorator;
058
059 /**
060 * This annotator processes tokens in the CAS and inserts corresponding lemmas. This annotator is
061 * type-system-independent and relies on implementations of TokenAttributeExtractor,
062 * TokenAttributeInserter, and AnnotationDataExtractor in order to function as intended.
063 *
064 * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu
065 *
066 */
067 public class BioLemmatizer_AE extends JCasAnnotator_ImplBase {
068
069 /**
070 * Parameter name used in the UIMA descriptor file for the token type
071 */
072 public static final String PARAM_TOKEN_TYPE_NAME = "tokenTypeName";
073
074 /**
075 * The token type to use. This parameter serves a dual purpose. It can be used to represent the
076 * class name of the token annotation type to retrieve from the CAS, e.g.
077 * org.apache.uima.examples.tokenizer.Token. When this is the case, annotations of this type
078 * will be processed, and lemmas for the text covered by these annotations will be determined.
079 * The second use for this configuration parameter is when it is used in conjunction with an
080 * {@link AnnotationDataExtractor}. The
081 * {@link AnnotationDataExtractor#getAnnotationType(Annotation)} method returns a {@link String}
082 * and when the value of that returned {@link String} equals the value of the tokenTypeName,
083 * then that annotation will be treated as a token and its covered text will be lemmatized.
084 */
085 @ConfigurationParameter(mandatory = true, description = "")
086 private String tokenTypeName;
087
088 /**
089 * Parameter name used in the UIMA descriptor file for the name of the method that returns a
090 * part-of-speech (as a String) from the token annotation
091 */
092 public static final String PARAM_TOKEN_GET_POS_METHOD_NAME = "tokenGetPosMethodName";
093
094 /**
095 * This is an optional parameter. It is used in conjunction with the tokenTypeName when that
096 * parameter represents a class name, i.e. when it represents the name of token annotation
097 * classes in the CAS. The tokenGetPosMethodName should be the name of the method in the
098 * tokenTypeName class that returns the part-of-speech tag. <br>
099 * <br>
100 * If this field is not set, then either the input tokens do not have part-of-speech information
101 * associated with them, or the tokenTypeName configuration parameter is not the name of an
102 * annotation class, but is instead a type as in the second scenario described above.
103 */
104 @ConfigurationParameter(mandatory = false, description = "")
105 private String tokenGetPosMethodName;
106
107 /**
108 * Parameter name used in the UIMA descriptor file for the {@link PartOfSpeechDecorator}
109 * implementation to use
110 */
111 public static final String PARAM_POS_DECORATOR_CLASS = "posDecoratorClassName";
112
113 /**
114 * The name of the {@link PartOfSpeechDecorator} implementation to use
115 */
116 @ConfigurationParameter(mandatory = false, description = "name of the PartOfSpeechDecorator implementation to use")
117 private String posDecoratorClassName;
118
119 /**
120 * The {@link PartOfSpeechDecorator} that will be initialized to the class specified by the
121 * {@link #posDecoratorClassName} configuration parameter
122 */
123 private PartOfSpeechDecorator posDecorator;
124
125 /**
126 * Parameter name used in the UIMA descriptor file for the {@link LemmaDecorator} implementation
127 * to use
128 */
129 public static final String PARAM_LEMMA_DECORATOR_CLASS = "lemmaDecoratorClassName";
130
131 /**
132 * The name of the {@link LemmaDecorator} implementation to use
133 */
134 @ConfigurationParameter(mandatory = true, description = "name of the LemmaDecorator implementation to use", defaultValue = "edu.ucdenver.ccp.nlp.biolemmatizer.uima.DefaultLemmaDecorator")
135 private String lemmaDecoratorClassName;
136
137 /**
138 * The {@link LemmaDecorator} that will be initialized to the class specified by the
139 * {@link #lemmaDecoratorClassName} configuration parameter
140 */
141 private LemmaDecorator lemmaDecorator;
142
143 /**
144 * Parameter name used in the UIMA descriptor file for the annotation data extractor
145 * implementation to use
146 */
147 public static final String PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS = "annotationDataExtractorClassName";
148
149 /**
150 * The name of the {@link AnnotationDataExtractor} implementation to use
151 */
152 @ConfigurationParameter(mandatory = true, description = "name of the AnnotationDataExtractor implementation to use", defaultValue = "edu.ucdenver.ccp.uima.shims.annotation.impl.DefaultAnnotationDataExtractor")
153 private String annotationDataExtractorClassName;
154
155 /**
156 * this {@link AnnotationDataExtractor} will be initialized based on the class name specified by
157 * the annotationDataExtractorClassName parameter
158 */
159 private AnnotationDataExtractor annotationDataExtractor;
160
161 /**
162 * This {@link BioLemmatizer} will do the bulk of the work in the
163 * {@link BioLemmatizer_AE#process(JCas)} method
164 */
165 private BioLemmatizer bioLemmatizer;
166
167 /**
168 * Initializes the {@link BioLemmatizer} that will be used by the
169 * {@link BioLemmatizer_AE#process(JCas)} method
170 *
171 * @see org.uimafit.component.JCasAnnotator_ImplBase#initialize(org.apache.uima.UimaContext)
172 */
173 @Override
174 public void initialize(UimaContext context) throws ResourceInitializationException {
175 super.initialize(context);
176 bioLemmatizer = new BioLemmatizer();
177 lemmaDecorator = (LemmaDecorator) invokeNoArgsConstructor(lemmaDecoratorClassName);
178 annotationDataExtractor = (AnnotationDataExtractor) invokeNoArgsConstructor(annotationDataExtractorClassName);
179 if (posDecoratorClassName != null) {
180 posDecorator = (PartOfSpeechDecorator) invokeNoArgsConstructor(posDecoratorClassName);
181 }
182 }
183
184 /**
185 * Returns an instantiation of the class specified by the input {@link String}. Assumes default
186 * constructor, i.e. no arguments.
187 *
188 * @param className
189 * @param arguments
190 * @return
191 */
192 public static Object invokeNoArgsConstructor(String className) {
193 try {
194 Class<?> cls = Class.forName(className);
195 Constructor<?> constructor = cls.getConstructor();
196 if (!constructor.isAccessible())
197 constructor.setAccessible(true);
198 return constructor.newInstance();
199 } catch (ClassNotFoundException e) {
200 throw new RuntimeException(e);
201 } catch (SecurityException e) {
202 throw new RuntimeException(e);
203 } catch (NoSuchMethodException e) {
204 throw new RuntimeException(e);
205 } catch (IllegalArgumentException e) {
206 throw new RuntimeException(e);
207 } catch (InstantiationException e) {
208 throw new RuntimeException(e);
209 } catch (IllegalAccessException e) {
210 throw new RuntimeException(e);
211 } catch (InvocationTargetException e) {
212 throw new RuntimeException(e);
213 }
214 }
215
216 /**
217 * This process(JCas) method cycles through all annotations in the CAS. For those that are
218 * identified as tokens by {@link AnnotationDataExtractor} implementation being used, an attempt
219 * is made to extract part-of-speech information. The covered text for each token is then
220 * lemmatized using the {@link BioLemmatizer}, using the part-of-speech information if it was
221 * available. Results from the {@link BioLemmatizer} are added to the CAS via the specified
222 * {@link LemmaDecorator} implementation.
223 *
224 * @see org.apache.uima.analysis_component.JCasAnnotator_ImplBase#process(org.apache.uima.jcas.JCas)
225 */
226 @Override
227 public void process(JCas jCas) throws AnalysisEngineProcessException {
228 for (Iterator<Annotation> annotIter = jCas.getJFSIndexRepository().getAnnotationIndex().iterator(); annotIter
229 .hasNext();) {
230 Annotation annotation = annotIter.next();
231 String annotationType = annotationDataExtractor.getAnnotationType(annotation);
232 if (annotationType != null && annotationType.equals(tokenTypeName)) {
233 List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> posTags = getPartOfSpeechTags(annotation);
234 if (posTags == null || posTags.isEmpty()) {
235 runBioLemmatizer(annotation, null);
236 } else {
237 for (edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech posTag : posTags) {
238 runBioLemmatizer(annotation, posTag.getPosTag());
239 }
240 }
241 }
242 }
243 }
244
245 /**
246 * This method uses the {@link BioLemmatizer} to lemmatize the covered text of the input
247 * {@link Annotation}. The lemma is added to the CAS via the {@link LemmaDecorator}
248 * implementation specified in this AE's configuration.
249 *
250 * @param annotation
251 * @param posTag
252 */
253 private void runBioLemmatizer(Annotation annotation, String posTag) {
254 String coveredText = annotationDataExtractor.getCoveredText(annotation);
255 LemmataEntry lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(coveredText, posTag);
256 Annotation lemmaAnnot = lemmaDecorator.getAnnotationToDecorate(annotation, annotationDataExtractor);
257 for (edu.ucdenver.ccp.nlp.biolemmatizer.LemmataEntry.Lemma lemma : lemmata.getLemmas()) {
258 lemmaDecorator.insertLemma(lemmaAnnot,
259 new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.Lemma(lemma.getLemma(),
260 new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech(lemma.getPos(),
261 lemma.getTagSetName())));
262 }
263 }
264
265 /**
266 * This method defaults to using the {@link PartOfSpeechDecorator} instance if there is one
267 * initialized. If not available, it will try to use the getPosMethod specified in the
268 * configuration. If neither are available, it is assumed that there is no input part-of-speech
269 * info and null is returned.
270 *
271 * @param annotation
272 *
273 * @return the POS tag as extracted from the input {@link Annotation}
274 */
275 private List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> getPartOfSpeechTags(
276 Annotation annotation) {
277 if (posDecorator != null) {
278 return posDecorator.extractPartsOfSpeech(annotation);
279 }
280 if (tokenGetPosMethodName != null) {
281 List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> posTagList = new ArrayList<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech>();
282 posTagList.add(getPosTagUsingSpecifiedMethodName(annotation));
283 return posTagList;
284 }
285 return null;
286 }
287
288 /**
289 * If the getPosTag method name is specified (and if no PartOfSpeechDecorator is specified) then
290 * this method is used to call the getPosTag method on the input {@link Annotation}.
291 *
292 * @param annotation
293 * @return the {@link edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech}
294 * extracted from the input {@link Annotation} using the specified
295 * {@link #tokenGetPosMethodName}.
296 */
297 private edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech getPosTagUsingSpecifiedMethodName(
298 Annotation annotation) {
299 try {
300 Method method = annotation.getClass().getDeclaredMethod(tokenGetPosMethodName);
301 String posTag = method.invoke(annotation).toString();
302 return new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech(posTag, null);
303 } catch (NoSuchMethodException e) {
304 throw new IllegalArgumentException(
305 "Error while attempting to retrieve part-of-speech information from class: "
306 + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e);
307 } catch (IllegalAccessException e) {
308 throw new IllegalArgumentException(
309 "Error while attempting to retrieve part-of-speech information from class: "
310 + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e);
311 } catch (InvocationTargetException e) {
312 throw new IllegalArgumentException(
313 "Error while attempting to retrieve part-of-speech information from class: "
314 + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e);
315 }
316 }
317
318 /**
319 * Initializes an {@link AnalysisEngine} that will determine lemmas for tokens that are present
320 * in the {@link CAS}
321 *
322 * @param tsd
323 * @param tokenClass
324 * @param tokenGetPosMethodName
325 * @param annotationDataExtractorClass
326 * @param lemmaDecoratorClass
327 * @return
328 * @throws ResourceInitializationException
329 *
330 */
331 public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd,
332 Class<? extends Annotation> tokenClass, String tokenGetPosMethodName,
333 Class<? extends AnnotationDataExtractor> annotationDataExtractorClass,
334 Class<? extends LemmaDecorator> lemmaDecoratorClass) throws ResourceInitializationException {
335 return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME,
336 tokenClass.getName(), PARAM_TOKEN_GET_POS_METHOD_NAME, tokenGetPosMethodName,
337 PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS, annotationDataExtractorClass.getName(),
338 PARAM_LEMMA_DECORATOR_CLASS, lemmaDecoratorClass.getName());
339 }
340
341 /**
342 * @param tsd
343 * @param tokenType
344 * @param partOfSpeechDecoratorClass
345 * @param annotationDataExtractorClass
346 * @param lemmaDecoratorClass
347 * @return
348 * @throws ResourceInitializationException
349 */
350 public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd,
351 String tokenType, Class<? extends PartOfSpeechDecorator> partOfSpeechDecoratorClass,
352 Class<? extends AnnotationDataExtractor> annotationDataExtractorClass,
353 Class<? extends LemmaDecorator> lemmaDecoratorClass) throws ResourceInitializationException {
354 return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME,
355 tokenType, PARAM_POS_DECORATOR_CLASS, partOfSpeechDecoratorClass.getName(),
356 PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS, annotationDataExtractorClass.getName(),
357 PARAM_LEMMA_DECORATOR_CLASS, lemmaDecoratorClass.getName());
358 }
359
360 /**
361 * @param tsd
362 * @param tokenClass
363 * @return
364 * @throws ResourceInitializationException
365 */
366 public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd,
367 Class<? extends Annotation> tokenClass) throws ResourceInitializationException {
368 return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME,
369 tokenClass.getName());
370 }
371
372 /**
373 * @param tsd
374 * @param tokenClass
375 * @param tokenGetPosMethodName
376 * @return
377 * @throws ResourceInitializationException
378 */
379 public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd,
380 Class<? extends Annotation> tokenClass, String tokenGetPosMethodName)
381 throws ResourceInitializationException {
382 return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME,
383 tokenClass.getName(), PARAM_TOKEN_GET_POS_METHOD_NAME, tokenGetPosMethodName);
384 }
385
386 }